* 1.0.0 Upwards
* 1.2.0 TC 4 march 2008
* Author: Thomas Cornelissen, University of Hannover, Germany
* Incorporates the grouping algorithm of Robert Creecy (original author), Lars Vilhuber (current author), Amine Ouazad (Stata port)
program define felsdv_group,
version 9

syntax varlist(min=1) [if] [in], Ivar(varname) Jvar(varname) Group(name)
set more off

if wordcount("`varlist'")==1 & "`cons'"!="cons"{
	di in red "If you specify no explicit regressorr you must add the option 'cons'."
	exit(322)
	}

timer clear
timer on 20
tempvar itemp jtemp jtemp2 miss sample n firmnum feffgbar f p pf gmax jmin moverp firm mnumcat cltemp mover mnum pobs
tempvar indid unitid /*Grouping*/
tempfile tempgroupfile tempdatafile /*Grouping*/

marksample touse

set varabbrev off
capture drop `group'

qui gen `group'=.

sort `ivar' `jvar'

timer on 1
qui egen `miss'=rowmiss(`varlist' `ivar' `jvar')
qui gen `sample'=0
qui replace `sample'=1 if `touse' & `miss'==0

tokenize "`varlist'"
local depvar "`1'"
_rmdcoll `varlist' if `sample'==1
local varlist `depvar' `r(varlist)'
tokenize "`varlist'"
local depvar "`1'"
mac shift
local indepvar "`*'"

qui gen `mover'=.
qui gen `n'=_n if `sample'==1
timer off 1

timer on 2
preserve
qui keep if `sample'==1
qui keep `ivar' `jvar' `varlist' `orig' `sample' `n' `group'
timer off 2


timer on 4
qui egen `itemp'=group(`ivar') if `sample'==1
qui egen `jtemp2'=group(`jvar') if `sample'==1
timer off 4

timer on 5
sort `itemp' `jtemp2'
timer off 5

timer on 6
qui by `itemp': gen byte `p'=1 if _n==1
qui by `itemp': gen `pobs'=_N
qui by `itemp' `jtemp2': gen `pf'=1 if _n==1
qui sum `pf'
by `itemp': egen `firmnum'=sum(`pf')
gen `mover'=(`firmnum'>1)
qui bysort `jtemp2': egen `mnum'=sum(`mover')
qui bysort `jtemp2': gen `f'=1 if _n==1
qui sum `mnum'

if r(mean)==0 {
	di in red "There are no movers in the sample. No firm effects can be identified."
	error 322
	}
timer off 6

timer on 7
qui replace `jtemp2'=0 if `mnum'==0
qui egen `jtemp'=group(`jtemp2') if `sample'==1
qui drop `jtemp2'
timer off 7

	timer on 8
	qui drop `group'
*---------------------------GROUPING START
global ORIGAUTHOR     = "Robert Creecy"
global CURRENTAUTHOR  = "Lars Vilhuber, STATA port by Amine Ouazad"
global VERSION	    = "0.1"

quietly {

save "`tempdatafile'", replace

	keep if `mnum'>0
	keep `itemp' `jtemp'

	egen `indid'  = group(`itemp')
	egen `unitid' = group(`jtemp')

	keep `itemp' `jtemp'  `indid' `unitid' 

	***** Keeps only cells : we are working with cells, not observations
	duplicates drop  `indid' `unitid', force

	mata: groups( "`indid'","`unitid'","`group'")

	*** Drop new school and pupil indexes
	keep `itemp' `jtemp' `group'
	sort `itemp' `jtemp'
	save "`tempgroupfile'", replace

	*** Merge group information with main input data file

use "`tempdatafile'"

	
	sort `itemp' `jtemp'
	merge `itemp' `jtemp' using "`tempgroupfile'"
	drop _merge
	}


/* --------------------------------------------------------------------------- */

qui replace `group'=0 if `mnum'==0

	sort `group'
	qui gen `moverp'=`mover' if `p'==1
	sort `jtemp'
	qui by `jtemp': gen `firm'=1 if _n==1
	local last=0
	qui sum `firm'
	local firms=r(N)


di in yellow _newline "Groups of firms connected by worker mobility:"
di  _newline "             Person-years       Persons          Movers         Firms"
table `group', c(N `itemp' N `p' sum `moverp' N `f') row
qui sum `f'
local firms=r(N)
qui sum `f' if `group'==0
local nomov=r(N)
qui sum `group'
	if r(min)==0 {
		di _newline "Note: Group 0 in the table regroups firms without movers."
		di _newline "No firm effect in group 0 is identified."
		di `firms' "-" `nomov' "-" `r(max)' " = " `firms'-`nomov'-`r(max)' " firm effects are identified."
		di "(number of firms - number of firms without movers - number of groups excl. group 0)"  _newline
		}
	else {
		di _newline "Note: Each firm has at least 1 mover."
		di `firms' "-" `r(max)' " = " `firms'-`r(max)' " firm effects are identified."
		di "Computed as: number of firms - number of groups" _newline
		}
*---------------------------GROUPING END
	timer off 8


mata: save1("`group'")


end

mata:
void groups(string scalar individualid, string scalar unitid, string scalar groupvar) {

		real scalar ncells, npers, nfirm; 	/* Data size : number of distinct observations number of pupils number of schools */
		real matrix byp, byf; 			/* Dataset sorted by pupil/by school */
		real vector pg, fg, pindex, findex, ptraced, ftraced, ponstack, fonstack;
	
		/***** Stack for tracing elements */
		real vector m;				/* Stack of pupils/schools */
		real scalar mpoint;			/* Number of elements on the stack */
	
		real scalar nptraced, nftraced;	// Number of traced elements
		real scalar lasttenp, lasttenf;
		real scalar nextfirm;

		real vector mtype; 	/* Type of the element on top of the stack */
					 	/* Convention : 					 */
					 	/* 1 for a pupil					 */
					 	/* 2 for a school					 */
	
		real scalar g;	/* Current group */
	
		real scalar j;
	
		real matrix data;		/* A data view used to add group information after the algorithm completed */
	
		printf("Grouping algorithm for CG\n");
	
		/****** Core data : cells sorted by person/by firm */

		byp = st_data(., (individualid, unitid));
		printf("Sorting data by pupil id\n");
		byp = sort(byp,1);
	
		byf = st_data(., (individualid, unitid));
		printf("Sorting data by school id\n");
		byf = sort(byf,2);
		
		/****** Data size */
	
		ncells = rows(byf);		/* Number of distinct observations (duplicates drop has to be done beforehand) */
		npers  = byp[ncells,1];		/* Number of pupils										 */
		nfirm  = byf[ncells,2];		/* Number of schools										 */

		printf("Data size : %9.0g cells, %9.0g pupils, %9.0g firms\n", ncells, npers, nfirm);
	
		/****** Initializing the stack and p/ftraced */

		printf("Initializing the stack\n");
	
		ptraced  = J(npers, 1, 0);	// No pupil has been traced yet
		ftraced  = J(nfirm, 1, 0);	// No school has been traced yet

		ponstack = J(npers, 1, 0);	// No pupil has been on the stack yet
		fonstack = J(nfirm, 1, 0);	// No school has been on the stack yet
	
		m 	= J(npers+nfirm, 1, 0); // Empty stack
		mtype = J(npers+nfirm, 1, 0);	// Unknown type of the element on top of the stack
	
		printf("Initializing pg,fg\n");
	
		pg	= J(npers, 1, 0);
		fg	= J(nfirm, 1, 0);
	
		/****** Initializing pindex, findex */
	
		printf("Initializing the index arrays\n");
	
		pindex = J(npers, 1, 0);
		findex = J(nfirm, 1, 0);
	
		for ( j = 1 ; j <= ncells ; j++) {
			pindex[byp[j,1]] = j;
			findex[byf[j,2]] = j;
		}
	
		g = 1;   	// The first group is group 1
		
		check_data(byp, byf, ncells);
	
		/***** Puts the first firm in the stack */
	
		printf("Putting first school on the stack\n");
		nextfirm = 1;
		mpoint = 1;
		m[mpoint] = 1;
		mtype[mpoint] = 2;
		fonstack[1] = 1;
	
		printf("Starting to trace the stack\n");
		
		nptraced = 0;
		nftraced = 0;
		lasttenp = 0;
		lasttenf = 0;

		while (mpoint > 0) {
	
			if (trunc((nptraced/npers)*100.0) > lasttenp || trunc((nftraced/nfirm)*100.0) > lasttenf) {
				lasttenp = trunc((nptraced/npers)*100.0);
				lasttenf = trunc((nftraced/nfirm)*100.0);
	
				printf("Progress : %9.0g pct pupils traced, %9.0g pct firms traced\n",lasttenp,lasttenf);
			}
	
			if (g > 1) {
				printf("%9.0g\t", g);
			}
			trace_stack( byp, byf, pg, fg, m, mpoint, mtype, ponstack, fonstack, ptraced, ftraced, pindex,  findex,g, nptraced, nftraced);
			if (mpoint == 0) {
				g = g + 1;
				while (nextfirm < nfirm && fg[nextfirm] != 0) {
					nextfirm = nextfirm + 1;
				}
				if (fg[nextfirm] == 0) {
					mpoint = 1;
					m[mpoint] = nextfirm;
					mtype[mpoint] = 2;
					fonstack[nextfirm] = 1;
				}
			}
		}
	
		printf("Finished processing, adding group data\n");
	
		st_addvar("long", groupvar);
	
		st_view(data, . ,(individualid, unitid,groupvar));
	
		for (j = 1 ; j<=ncells; j++ ) {
			data[j,3] = pg[data[j,1]];
			if (pg[data[j,1]] != fg[data[j,2]]) {
				printf("Error in the output data.\n");
				printf("Observation %9.0g, Pupil %9.0g, School %9.0g, Group of pupil %9.0g, Group of school %j\n",
						j, data[j,1], data[j,2], pg[data[j,1]], fg[data[j,2]]);
				exit(1);
			}
		}
	
		printf("Finished adding groups.\n");
	
	}

	/*

	Name:			check_data()
	
	Purpose:	This function checks whether data is correctly sequenced.
	
	 */
	
	function check_data(real matrix byp, real matrix byf, real scalar ncells) {
		
		real scalar thispers, thisfirm;
	
		real scalar i;
	
		thispers = 1;
		thisfirm = 1;
	
		for ( i=1 ; i <= ncells ; i++ ) {
			if ( byp[i,1] != thispers ) {
				if ( byp[i,1] != thispers+1 ) {
					printf("Error : by pupil file not correctly sorted or missing sequence number\n");
					printf("Previous person : %9.0g , This person : %9.0g , Index in file %9.0g\n", thispers, byp[i,1], i);
					exit(1);
				}
				thispers = thispers + 1 ;
			}
	

			if ( byf[i,2] != thisfirm ) {
				if ( byf[i,2] != thisfirm + 1 ) {
					printf("Error : by school file not correctly sorted or missing sequence number\n");
					printf("Previous school : %9.0g , This school : %9.0g , Index in file %9.0g\n", thisfirm, byf[i,2], i);
					exit(1);
				}
				thisfirm = thisfirm + 1;
			}
		
		}
	
		printf("Data checked - By pupil and by school files correctly sorted and sequenced\n");
	}

	/*
	
	Name: 	trace_stack()
	
	Purpose:	Builds the connex component of the graph of the elements on the stack

	 */

	void trace_stack( real matrix byp, real matrix byf, real vector pg,  real vector fg, 
				real vector m, real scalar mpoint, real vector mtype,
				real vector ponstack, real vector fonstack,
				real vector ptraced,  real vector ftraced,
				real vector pindex,   real vector findex,
				real scalar g, real scalar nptraced, real scalar nftraced) {
	
		real scalar thispers, thisfirm, person, afirm, lower, upper;
	
		if (mtype[mpoint] == 2) { // the element on top of the stack is a firm
			thisfirm = m[mpoint];
			mpoint = mpoint - 1;
			fg[thisfirm] = g;
			ftraced[thisfirm] = 1;
			fonstack[thisfirm] = 0;
			if (thisfirm == 1) {
				lower = 1;
			} else {
				lower =  findex[thisfirm - 1] + 1;
			}
			upper = findex[thisfirm];
			for (person = lower ; person <= upper ; person ++) {
				thispers = byf[person, 1];
				pg[thispers] = g;
				if (ptraced[thispers] == 0 && ponstack[thispers] == 0) {
					nptraced = nptraced + 1;
					mpoint = mpoint + 1;
					m[mpoint] = thispers;
					mtype[mpoint] = 1;
					ponstack[thispers] = 1;
				}
			}
		} else if (mtype[mpoint] == 1) { // the element on top of the stack is a person
			//printf("A person\t");
			thispers = m[mpoint];
			mpoint = mpoint - 1;
			pg[thispers] = g;
			ptraced[thispers] = 1;
			ponstack[thispers] = 0;
			if (thispers == 1) {
				lower = 1;
			} else {
				lower = pindex[thispers - 1] +1;
			}
			upper = pindex[thispers];
			for (afirm = lower; afirm <= upper; afirm++) {
				thisfirm = byp[afirm, 2];
				fg[thisfirm] = g;
				if (ftraced[thisfirm] == 0 && fonstack[thisfirm] == 0) {
					nftraced = nftraced + 1;
					mpoint = mpoint + 1;
					m[mpoint] = thisfirm;
					mtype[mpoint] = 2;
					fonstack[thisfirm] = 1;
				}
			}
		} else {
			printf("Incorrect type, element number %9.0g of the stack, type %9.0g\n",mpoint,mtype[mpoint]);
		}
	
	}


void save1(string scalar var)
{
M=st_data(.,var)
stata("restore")
stata("sort "+"`"+"n"+"'")
/*(void) st_addvar("double",var)*/
st_store(.,var,st_varindex(st_macroexpand("`"+"sample"+"'")),M)
}
end
